*
** CEX numbers
* Original author: David Zhang
* Original date: Unknown
* Auditor/Editor: Jason Premo
* Date of audit: 7/18/16
*
* Nearly all comments are by Jason Premo (this file was basically uncommented before).
*
* Description: Creates the CEX numbers for the survey comparison table in the SST paper. 
* This file has been completely reworked, as the old file used the wrong data year,
* and additionally did not properly apply weights to the relevant subpopulations
* from the expenditure files.
*
* Outputs to: Townsend/results/balance_sheet.xslx
*
* Filepath: /shared/sdsjxs/Townsend/scripts/cex.do

clear all
set more off
cap log close

local in_path "/shared/sdsjxs/Townsend/rawdata/cex"
local out_path "/shared/sdsjxs/Townsend/results"
local data_gen no


******************************** Townsend *************************************

*** Create a master file
*FMLI files
use "`in_path'/fmli121x.dta", clear
gen qyear = "20121"
append using "`in_path'/fmli122.dta"
replace qyear = "20122" if missing(qyear)
append using "`in_path'/fmli123.dta"
replace qyear = "20123" if missing(qyear)
append using "`in_path'/fmli124.dta"
replace qyear = "20124" if missing(qyear)
append using "`in_path'/fmli131.dta"
replace qyear = "20131" if missing(qyear)

*This loop replaces the A-coded missing values with zeros, which is consistent
*with taking the sort of average that we are interested in.
*I have confirmed this over the phone with the folks at the BLS.
qui describe, varlist
foreach item in `r(varlist)' {
	local type = substr("`: type `item''", 1, 3)
	local firstfour = substr("`item'", 1, 4)
	local lastthree = reverse(substr(reverse("`item'"), 1, 3))
	
	if "`type'" == "str" {
		continue
		}
	else {
		local missing_var1 = "`firstfour'_`lastthree'"
		local missing_var2 = "`item'_"
		cap replace `item' = 0 if `missing_var1' == "A"
		cap replace `item' = 0 if `missing_var2' == "A"
		}
	} 

rename finlwt21 weight


tempfile fmli
save `fmli', replace


*MEMI files
clear
use "`in_path'/memi121x.dta", clear
append using "`in_path'/memi122.dta"
append using "`in_path'/memi123.dta"
append using "`in_path'/memi124.dta"
append using "`in_path'/memi131.dta"

*Only care about employment taxes
replace anfedtxm = 0 if anfe_txm == "A"
replace ansltxm = 0 if ansltxm_ == "A"
egen emply_tax = rowtotal(anfedtxm ansltxm)

collapse (sum) emply_tax, by(newid)

merge 1:1 newid using `fmli', keepusing(weight) nogen

tempfile memi
save `memi', replace


*Only really need to run this once. It takes a good while.
if "`data_gen'" == "yes" {
	*Create a backup and remove the old file
	cd "/shared/sdsjxs/Townsend/rawdata/cex/expn"
	shell rm ../expncollected_backup.dta
	shell mv ../expncollected.dta ../expncollected_backup.dta

	*creating the local with all of the files in the folder
	local allfiles :  dir . files "*.dta"
*looping through
	foreach f in `allfiles' {		
		display "Starting on file `f'"
		use "`f'", clear
		gen filename = "`f'"
		capture append using ../expncollected.dta
		save ../expncollected.dta, replace
		}
	
	qui describe, varlist
	foreach item in `r(varlist)' {
		local type = substr("`: type `item''", 1, 3)
		local firstfour = substr("`item'", 1, 4)
		local lastthree = reverse(substr(reverse("`item'"), 1, 3))
		
		if "`type'" == "str" {
			continue
		}
		else {
			local missing_var1 = "`firstfour'_`lastthree'"
			local missing_var2 = "`item'_"
			cap replace `item' = 0 if `missing_var1' == "A"
			cap replace `item' = 0 if `missing_var2' == "A"
			}
		} 
	}

** Because of the lack of overlap between the expenditure files, it will ultimately be impossible to extract
** medians for any metric that requires using them. So no asset or liability medians.

use "`in_path'/expncollected.dta", clear
* This merge should have no unmatched items
merge m:1 newid using `fmli', nogen

*tempfile start
*save `start', replace


******* Assets

*** NOTE: According to people at the BLS with whom I've been in email contact,
* asset and liability questions are only asked during interview 5. However...
* this clearly isn't the case for some variables. Below is my best attempt
* to reconcile what they say, what's in the questionnaire, and what's in the data.

** FMLI files

*This generates a subpopulation flag. These flags were originally used with the
* svy command below. However, this is no longer the case. I retain them because
* they may be of use later.
gen fmlisub = qyear == "20124"


*For checking/savings accounts, it's clear that the missing values
*should be genuinely treated as missing. This is not clear
*with some of the other variables.
replace ckbkactx = . if interi != 5
gen f_checking = ckbkactx
replace savacctx = . if interi != 5
gen f_savings = savacctx

* Stocks, private bonds, and mutual funds (altogether).
* Subtract out savings bonds to avoid double-counting.
replace secestx = . if interi!=5
replace secestx = 0 if secestx == . & interi == 5 & secestx_ == "A"
replace usbndx = . if interi != 5
replace usbndx = 0 if usbndx == . & interi == 5 & usbndx_ == "A"
gen f_stock = secestx - usbndx

*Savings bonds
gen f_svgs_bond = usbndx
*Money owed by others
replace monyowdx = . if interi != 5
gen f_money_owed = monyowdx


** Home values
*Use cutenure to identify which values are truly missing.
*This is not quite right, because theoretically somebody
*might own a vacation property and rent their primary residence...
*but I think this is the best we can get.
gen homevalsub = filename == "opi12.dta"
replace propvalx = 0 if inlist(cutenure, "4", "5", "6") & prop_alx == "A"
gen e_prim_home = propvalx if inlist(ownyi, "100")
gen e_realestate = propvalx if inlist(ownyi, "200", "300", "400", "500", "600")


******* Liabilities

*Credit cards and revolving store credit
gen creditsub = filename == "fna12.dta"
replace creditx5 = . if interi != 5
gen e_creditcard = creditx5 if substr(creditr5, 1, 1) == "1"
gen e_revolving_store = creditx5 if substr(creditr5, 1, 1) == "2"
gen e_other_credit = creditx5 if inlist(creditr5, "510", "700", "800")

*HELOCs
** This includes everybody who has a HELOC
gen helocsub = filename == "oph12.dta"
gen e_heloc=totowed
*These people definitely don't have helocs...
replace e_heloc = 0 if inlist(cutenure, "4", "5", "6")

*HELs
gen helsub = filename == "hel12.dta"
gen hel_flag = !missing(qprinm1x) | !missing(qprinm2x) | !missing(qprinm3x)
egen e_hel = rowmean(qprinm1x qprinm2x qprinm3x) if hel_flag == 1
replace e_hel = 0 if inlist(cutenure, "4", "5", "6")

*Mortgages
gen mortsub = filename == "mor12.dta"
gen mortgage_flag = !missing(qblncm1x) | !missing(qblncm2x) | !missing(qblncm3x)
egen mortgage_new = rowmean(qblncm*) if mortgage_flag == 1
replace mortgage_new = . if cutenure == "1" & mortgage_new == 0
replace mortgage_new = . if cutenure == "3"
gen e_prim_mort = mortgage_new if ownyf == "100"
gen e_other_mort = mortgage_new if ownyf != "100"

*Vehicle loans
gen vehsub = filename == "ovb12.dta"
gen vehic_flag = !missing(qbalnm1x) | !missing(qbalnm2x) | !missing(qbalnm3x)
egen e_vehic_loan = rowmean(qbalnm*) if vehic_flag == 1



******* Income
*After much searching, I have learned that varnamex is the raw income variable, while varnamem (or some variation)
* is the mean-imputed version. See the imputation guide.
*The variables which end in M are the ones we want, according to the documentation. 

*Relevant categories:
** fincbtxm -- Final income before taxes (sum of other variables)
** fsalarym -- Wages or salaries (the documentation reports this as fsalaryxm. That variable doesn't exist. I think it's a typo)
** fnonfrmm -- Business income (non-farm)
** ffrmincm -- Farm income
** finincxm -- Dividends, royalties, estates, or trusts (the documentation reports this as finincm. Another typo.)
** frretirm -- Social security and railroad retirement income
** fssixm -- SSI income
** unemplxm -- Unemployment income
** compensm -- Workers comp and veteran's benefits
** welfarem -- Public assistance/welfare
** intearnm -- Interest on savings accounts and bonds
** pensionm -- Pension income
** inclosam -- Roomers/boarders
** inclosbm -- Renters
** aliothxm -- Alimony and other transfers from persons
** chdothxm -- Child support (non-lump sum)
** othrincm -- Other income (scholarships, fellowship, non-work stpends, care of foster children)
** foodsmpm -- Food stamps

*Categorize

gen f_total_inc = fincbtxm 
gen f_total_cen_inc = f_total_inc
replace f_total_cen_inc = 200000 if f_total_cen_inc > 200000
gen f_wage_inc = fsalarym 
egen f_rent_inc = rowtotal(inclosam inclosbm)
egen f_business_inc = rowtotal(fnonfrmm ffrmincm)
egen f_intdiv_inc = rowtotal(finincxm intearnm)
*Note: govtran includes workers comp, because we don't have a way of disaggregating it.
egen f_govtran_inc = rowtotal(frretirm fssixm unemplxm compensm welfarem foodsmpm)
egen f_othr_tran_pers_inc = rowtotal(aliothxm chdothxm)
egen f_othr_inc = rowtotal(pensionm othrincm)


**** Expenditures
** See page 22 of the survey guide for the PQ vs CQ distinction.
** The important point is that the CQ (current quarter) variables
** and the PQ (past quarter) variables actually sum to a single
** calendar quarter. So what's below is correct.

gen f_tot_exp = (totexppq + totexpcq)*4 
gen f_foodalcohol = (foodcq + alcbevcq + foodpq + alcbevpq)*4
gen f_apparel = (apparcq + apparpq)*4
gen f_housing = (houscq + houspq)*4
gen f_health = (healthcq + healthpq)*4
gen f_transportation = (transcq + transpq)*4
gen f_entertainment = (entertcq + entertpq)*4
gen f_education = (educacq + educapq)*4
gen f_reading = (readcq + readpq)*4
gen f_insurance = (perinscq + perinspq)*4
gen f_personalcare = (perscacq + perscapq)*4
gen f_tobacco = (tobacccq + tobaccpq)*4
gen f_misc = (misccq + miscpq)*4
gen f_cashcontrib = (cashcocq + cashcopq)*4
gen f_proptax = (proptxcq + proptxpq)*4

*The variables from the expenditure files need to be summed, while the variables
*from the fmli file(s) need to be averaged.
collapse (mean) weight f_* (max) *sub (sum) e_*, by(newid) fast

svyset newid [pw=weight]

tempfile start
save `start', replace


*** Asset means

local f_assets f_checking f_savings f_stock f_svgs_bond f_money_owed
local e_assets e_prim_home e_realestate

local i = 0
foreach asset in `f_assets' {
	svy: mean `asset'
	if "`i'" == "0" {
		mat assets = e(b)'
		}
	else {
		mat assets`i' = e(b)'
		mat assets = assets\assets`i'
		}
	local ++i
	}

foreach asset in `e_assets' {
	svy: mean `asset'
	mat assets`i' = e(b)'
	mat assets = assets\assets`i'
	local ++i
	}


clear

svmat2 assets, rnames(varname)
order varname
replace assets1 = round(assets1)

*May not be possible to create medians from this data.

export excel using "`out_path'/balance_sheet.xlsx", sheet("cex_assets", replace) 



use `start', clear


*** Liability means
*The subpop is only used for credit, as the hel and helocs are (according to the BLS folks) an exhuastive
* list of HEL and HELOC people in the sample. The other variables are also appropriately filtered.
svy, subpop(creditsub): mean e_creditcard e_revolving_store e_other_credit
mat liabilities = e(b)'


svy: mean e_hel
mat liabilitiesa = e(b)'
svy: mean e_heloc
mat liabilitiesb = e(b)'
mat liabilities2 = liabilitiesa + liabilitiesb

mat liabilities = liabilities\liabilities2

svy: mean e_prim_mort e_other_mort
mat liabilities3 = e(b)'
mat liabilities = liabilities\liabilities3

svy: mean e_vehic_loan
mat liabilities4 = e(b)'
mat liabilities = liabilities\liabilities4

clear

svmat2 liabilities, rnames(varname)
order varname
replace liabilities1 = round(liabilities1)

*May not be possible to create medians from this data.

export excel using "`out_path'/balance_sheet.xlsx", sheet("cex_liabilities", replace) 


use `start', clear

*** Income means
*Median total income first
preserve
collapse (median) income1 = f_total_inc [pw=weight], fast
gen varname = "med_total_inc"
tempfile inc_med
save `inc_med', replace
restore

*Now means
local f_income f_total_inc f_total_cen_inc f_wage_inc f_rent_inc f_business_inc f_intdiv_inc f_govtran_inc f_othr_tran_pers_inc f_othr_inc

local i = 0
foreach source in `f_income' {
	svy: mean `source'
	if "`i'" == "0" {
		mat income = e(b)'
		}
	else {
		mat income`i' = e(b)'
		mat income = income\income`i'
		}
	local ++i
	}


clear

svmat2 income, rnames(varname)
order varname
append using `inc_med'
replace income1 = round(income1)


export excel using "`out_path'/balance_sheet.xlsx", sheet("cex_income", replace) 



use `start', clear

*** Expenditure means
*Median first
preserve
collapse (median) exp1 = f_tot_exp [pw=weight], fast
gen varname = "med_total_exp"
tempfile exp_med
save `exp_med', replace
restore

*Just employment taxes
preserve
use `memi', clear
collapse (mean) exp1 = emply_tax [pweight = weight]
gen varname = "emply_tax"
tempfile emply_tax
save `emply_tax'
restore

*Means
local f_exp f_tot_exp f_foodalcohol f_apparel f_housing f_health f_transportation f_entertainment f_education f_reading f_insurance f_personalcare f_tobacco f_misc f_cashcontrib f_proptax

local i = 0
foreach source in `f_exp' {
	svy: mean `source'
	if "`i'" == "0" {
		mat exp = e(b)'
		}
	else {
		mat exp`i' = e(b)'
		mat exp = exp\exp`i'
		}
	local ++i
	}

clear

svmat2 exp, rnames(varname)
order varname
append using `exp_med'
append using `emply_tax'
replace exp1 = round(exp1)

* Creating medians will require fweights, recalculated weights, or maybe clever use of the collapse command. Have to think about this a bit.

export excel using "`out_path'/balance_sheet.xlsx", sheet("cex_expenditure", replace) 



